






capture program drop big_data_prepare
program define big_data_prepare
	// Cleans the hourly billing data currently in memory and attaches
	// event-day, account, climate, station and weather information.
	// Takes no arguments and works on the dataset in memory in place.
	// All using-file paths are relative to the current working directory.

	// Observations with zero kwh are discarded.
	drop if kwh == 0

	// Leftover variables that may or may not be present.
	foreach leftover in sp_min_date sm_date_ym date_ym {
		capture drop `leftover'
	}

	// Variables not used right now (these must exist, otherwise the program stops).
	drop sp apct naics
	capture drop _merge

	// Flag event days: a date found in the event-date file is an event day.
	// Dates that appear only in the using file are not kept.
	merge m:1 date using "stata data/2015 pdp event dates", keep(master match)
	generate byte event_day = (_merge == 3)
	drop _merge

	// Account-level extract: keep every master row, matched or not.
	merge m:1 sa using  "stata data/PDP extract 2015_0713 data - useful", keep(master match) nogenerate

	// From here on only matched rows survive each merge.
	merge m:1 sa using "stata data\TOU_demographic 2 climate", keep(match) nogenerate
	merge m:1 sa using "stata data\sa to station geomatch 2014-2015 weather data", keep(match) nogenerate
	merge m:1 stn_call hour date using "public raw data/Weather data 2014-2015 for merge", keep(match) nogenerate

	// The station matching helpers are no longer needed once weather is attached.
	drop lat_all long_all stn_call km_to_nid

	compress *
end




// Run big_data_prepare on each of the six summer raw-billing parts
// and save one prepared file per part.
forvalues num = 1/6 {
	use "raw data\raw_billing_data_summer_part_`num'", clear
	big_data_prepare
	save "stata data\hourly data save\Hourly data A0_`num'", replace
}

//
// Now do the January-May and Nov-Dec (non-summer) data
//

// Same preparation for the six non-summer raw-billing parts.
forvalues num = 1/6 {
	use "raw data\raw_billing_data_not_summer_part_`num'", clear
	big_data_prepare
	save "stata data\hourly data save\Hourly data 2014 non pdp months for AC A0_`num'", replace
}



// Stack the six prepared summer parts into one dataset.
use "stata data\hourly data save\Hourly data A0_1", clear
forvalues num = 2/6 {
	append using "stata data\hourly data save\Hourly data A0_`num'"
}

compress *

// Write one file per value of time (33 through 45).
// NOTE: the closing macro quote is doubled in the save path, so each saved
// file name ends with a literal apostrophe. Later use commands in this file
// spell the name the same way, so it is kept as is here.
forvalues num = 33/45 {
	preserve
	keep if time == `num'
	save "stata data\\hourly data save\\big data time `num''", replace
	restore
}
*


// Now make the same cuts for the 2014 non-PDP data

// Stack the six prepared non-summer parts into one dataset.
use "stata data\hourly data save\Hourly data 2014 non pdp months for AC A0_1", clear
forvalues num = 2/6 {
	append using "stata data\hourly data save\Hourly data 2014 non pdp months for AC A0_`num'"
}

compress *

// Write one file per value of time (33 through 45).
forvalues num = 33/45 {
	preserve
	keep if time == `num'
	save "stata data\\hourly data save\\big data 2014 non pdp months for AC time `num'", replace
	restore
}
*


// For each time file, sum 2014 kwh within sa and save the small result.
forvalues num = 33/45 {
	use "stata data\\hourly data save\\big data time `num''", clear
	keep if year == 2014
	collapse (sum) kwh, by(sa)
	save "stata data\\hourly data save\\just total kwh number `num'", replace
}

// Stack the thirteen per-time totals into one file.
// The data in memory at this point was just saved unchanged, which is why
// this use works without the clear option.
use "stata data\\hourly data save\\just total kwh number 33"
forvalues num = 34/45 {
	append using "stata data\\hourly data save\\just total kwh number `num'"
}

rename kwh total_kwh
save "stata data/larger total_kwh file", replace


// Below do the collapsing for the data I want.
//
// collapse_multiple_meters works on the dataset in memory (one of the
// per-time hourly files) and takes no arguments. It:
//   1. sets missing pdp to 0,
//   2. attaches the multiple-meter collapsing guide by sa and keeps matches only,
//   3. builds return_drop = 1 for every group_dup that has any dir == "R" row,
//   4. builds kwh_collapse: the total of kwh within group_dup/date/hour when
//      multiple == 1, or the row's own kwh when multiple == 0,
//   5. builds temp_f_mean the same way using the mean of temp_f,
//   6. drops the meter-level kwh, sa, dir and temp_f and removes rows that are
//      then exact duplicates.
capture program drop collapse_multiple_meters
program define collapse_multiple_meters

	replace pdp = 0 if pdp == .

	merge m:1 sa using "stata data/said collapsing guide for multiple meters", keep(match) nogenerate

	// Return analysis: flag the whole group_dup if any of its rows has dir "R".
	// The row-level flag must be gone before duplicates drop, so it is dropped
	// explicitly together with dir.
	tempvar is_return
	generate byte `is_return' = (dir == "R")
	egen return_drop = max(`is_return'), by(group_dup)
	drop `is_return' dir

	// total() is the documented egen function; the older sum() spelling is an
	// undocumented synonym with the same result.
	egen double kwh_collapse = total(kwh) if multiple == 1, by(group_dup date hour)
	// Only one of the two conditions is strictly needed; both are kept to be sure.
	replace kwh_collapse = kwh if kwh_collapse == . & multiple == 0
	order kwh* group_dup

	// temp_f should be about the same within a group_dup. A few times it
	// varied, so it is replaced with the average across the group_dup.
	egen temp_f_mean = mean(temp_f) if multiple == 1, by(group_dup hour date)
	replace temp_f_mean = temp_f if temp_f_mean == .
	drop temp_f

	// With the meter-level columns gone, rows of the same group collapse
	// into exact duplicates.
	drop kwh sa
	duplicates drop
end

// Summer (PDP months) files.
// The input file names end with a literal apostrophe because that is how the
// split step saved them.
forvalues num = 33/45 {
	use "stata data\\hourly data save\\big data time `num''", clear
	collapse_multiple_meters
	save "stata data\big data multiple cleaned time `num'", replace
}
*

// Run it again for the non-PDP 2014 data for AC.
forvalues num = 33/45 {
	use "stata data\\hourly data save\\big data 2014 non pdp months for AC time `num'", clear
	*keep if date == mdy(8,18,2015) & hour == 17
	collapse_multiple_meters
	save "stata data\big 2014 non pdp months for AC data multiple cleaned time `num'", replace
}
*



// Structural winners: build per-sa 2014 kwh totals split by event hours.
// NOTE(review): the cleaned files were saved after kwh and sa were dropped,
// so the short names in the varlist below presumably resolve through Stata
// variable abbreviation (for example to kwh_collapse) -- confirm.
forvalues num = 33/45 {
	use sa event_day hour kwh sa climate date pdp optionally using  "stata data\big data multiple cleaned time `num'", clear

	// Computed over all loaded rows, before restricting to 2014.
	egen pdp_always_on = max(pdp), by(sa)
	keep if year(date) == 2014
	drop pdp

	// Event hours are hours 14 through 17 on an event day.
	generate event_hours = (event_day == 1 & inrange(hour,14,17))

	collapse (sum) kwh, by(sa climate event_hours pdp_always_on optionally)

	save "stata data/struct winner collapse time `num'", replace
}
*

// Structural winner calculations

// Stack the per-time structural-winner collapses into one dataset.
use "stata data/struct winner collapse time 33", clear
forvalues num = 34/45 {
	append using "stata data//struct winner collapse time `num'"
}

// There are for some reason a few duplicates at the event_hours/sa_new level.
// All 44 were checked by hand and seem to be fine. Full variable names are
// used here instead of abbreviations so that a new variable with a similar
// prefix cannot silently change the meaning.
duplicates drop event_hours sa_new, force

// One strange thing happened for this account. After the step above the data
// is already unique on event_hours and sa_new, so this is kept only as a
// safeguard.
duplicates drop sa_new pdp_always_on event_hours if sa_new == 5840987738, force

rename kwh_collapse kwh
drop climate
rename sa_new sa

merge m:1 sa using "stata data/TOU_demographic sm_date", keep(match) nogenerate

// Weekly bins relative to the cutoff 18870, with no bin 0 (non-positive
// bins are shifted down by one).
// NOTE(review): if sm_date is a Stata daily date, 18870 is 31aug2011 -- confirm.
gen days_before = -1*(sm_date - 18870)
capture drop bin
gen bin = round((days_before+4)/7)
replace bin = bin - 1 if bin <= 0

// One row per sa with event and non-event kwh side by side.
reshape wide kwh, i(sa) j(event_hours)
rename kwh0 kwh_non_event
rename kwh1 kwh_event

// Stata treats missing as larger than any number, so a bare "bin >= 0" would
// mark accounts with a missing bin as eligible. Leave those missing instead.
gen eligible = (bin >= 0) if !missing(bin)

// NOTE(review): .00977 and .60 look like per-kwh rates (non-event credit and
// event charge) -- confirm units and source.
gen non_event_savings = .00977 * kwh_non_event
gen event_pdp_charges = .60 * kwh_event

gen money_saved = non_event_savings - event_pdp_charges
egen savings_bin = cut(money_saved), at(-400 (10) 400)

// Same missing-value issue: money_saved is missing whenever an account has
// only event or only non-event kwh after the reshape, and a bare
// "money_saved > 0" would call every such account a winner. They are left
// missing here.
// NOTE(review): if a missing kwh_event or kwh_non_event should mean zero
// usage, recode it to 0 right after the reshape instead.
gen struct_winner = (money_saved > 0) if !missing(money_saved)

gen total_kwh = kwh_non_event + kwh_event

gen scaled_savings = 1000 * money_saved / total_kwh

// Small lookup file with just the winner flag and total kwh.
preserve
keep sa total_kwh struct_winner
save "stata data/struct winner info with total kwh", replace
restore

save "stata data/structural winner 2014 raw calcultations", replace
		


		
capture program drop make_m8_to_8_dataset_program
program define make_m8_to_8_dataset_program
	// Turns a cleaned multiple-meter dataset in memory into analysis form:
	// standard variable names, return groups removed, and a weekly bin
	// attached from sm_date. Takes no arguments.

	// sa_new may already have been renamed, hence capture.
	capture rename sa_new sa
	order sa
	rename kwh_collapse kwh
	rename temp_f_mean temp_f

	// Remove every row whose return_drop flag is 1, then the flag itself.
	keep if return_drop != 1
	drop return_drop

	// Only accounts found in the sm_date file are kept.
	merge m:1 sa using "stata data/TOU_demographic sm_date", keep(match) nogenerate

	// Weekly bins relative to the cutoff 18870, with no bin 0 (non-positive
	// bins are shifted down by one).
	generate days_before = 18870 - sm_date
	capture drop bin
	generate bin = round((days_before + 4)/7)
	replace bin = bin - 1 if bin <= 0

	drop sm_date days_before
end
		

// Build the -8 to 8 bin dataset from the cleaned time 38 through 41 files.
use "stata data\big data multiple cleaned time 38", clear
forvalues num = 39/41 {
	append using "stata data\big data multiple cleaned time `num'"
}

make_m8_to_8_dataset_program

// Keep bins -8 through 8 and remove rows flagged as optionally enrolled.
keep if inrange(bin,-8,8) & optionally_enrolled != 1
compress *

save "stata data/A1 data -8 to 8 HUGE", replace




// Build the -27 to 27 bin dataset (2014 only) from all cleaned time files.
use "stata data\big data multiple cleaned time 33", clear
forvalues num = 34/45 {
	append using "stata data\big data multiple cleaned time `num'"
}

keep if year == 2014
drop many_meter_issue share_facility_big_meter multiple_collapse_ok group_dup

make_m8_to_8_dataset_program
keep if inrange(bin,-27,27)
compress *

save "stata data/A1 data -27 to 27 HUGE 2014 only", replace
